ZINB fit to get true parameters

data <- read.table("~/Documents/BRAIN/gitrepo/zinb_analysis/sims/datasets/expression_mRNA_17-Aug-2014.txt", sep='\t', stringsAsFactors = FALSE, comment.char = '%')

level1 <- as.factor(as.matrix(data)[9,-(1:2)])
tissue <- as.factor(as.matrix(data)[1,-(1:2)])
group <- as.factor(as.matrix(data)[2,-(1:2)])
nmolecule <- as.numeric(as.matrix(data)[3,-(1:2)])
sex <- as.factor(as.matrix(data)[5,-(1:2)])
level2 <- as.factor(as.matrix(data)[10,-(1:2)])

counts <- as.matrix(data[12:NROW(data),-(1:2)])
counts <- matrix(as.numeric(counts), ncol=ncol(counts), nrow=nrow(counts))
rownames(counts) <- data[12:NROW(data),1]
colnames(counts) <- data[8, -(1:2)]

filter = sample(1:ncol(counts), 300, replace = F)
counts = counts[, filter]
group = droplevels(group[filter])
sex = sex[filter]
tissue = tissue[filter]
level1 = droplevels(level1[filter])
level2 = level2[filter]
filterGenes = apply(counts > 5, 1, sum) >= 5
counts <- counts[filterGenes, ]

#vars <- rowVars(log1p(counts))
#names(vars) <- rownames(counts)
#vars <- sort(vars, decreasing = TRUE)
#core <- counts[names(vars)[1:1000],]
core <- counts[sample(1:nrow(counts), 1000),]
sum(core == 0)/(ncol(core) * nrow(core))
## [1] 0.45793
col1 <- brewer.pal(9, "Set1")
col2 <- brewer.pal(8, "Set2")
colTissue <- col1[tissue]
colMerged <- col2[level1]

We use Zeisel dataset with all cells and only 1000 randomly selected genes. The dimensions are

dim(core)
## [1] 1000  300
table(tissue)
## tissue
## ca1hippocampus       sscortex 
##            136            164
table(tissue, group)
##                 group
## tissue            1  2  3  4  5  6  7  8  9
##   ca1hippocampus 16  2 92 10  2  4  5  1  4
##   sscortex       17 36  1 69  8 15 13  0  5
table(tissue, sex)
##                 sex
## tissue            -1   0   1
##   ca1hippocampus  84   1  51
##   sscortex        62   0 102
table(level1)
## level1
## astrocytes_ependymal    endothelial-mural         interneurons 
##                   19                   28                   33 
##            microglia     oligodendrocytes        pyramidal CA1 
##                   10                   79                   94 
##         pyramidal SS 
##                   37

Colors in left panel correspond to ? . Colors in right panel correspond to ?.

par(mfrow=c(1, 2))
fq <- EDASeq::betweenLaneNormalization(core, which="full")
pcafq <- prcomp(t(log1p(fq)))
plot(pcafq$x, col=colMerged, pch=20, main="PCA of FQ log-counts, centered not scaled")
plot(pcafq$x, col=colTissue, pch=20, main="PCA of FQ log-counts, centered not scaled")

Fit data with K = 2, V and X intercepts in both Mu and Pi, commondispersion = FALSE, and no covariate.

print(system.time(zinb <- zinbFit(core, ncores = 3, K = 2,
                                  commondispersion = FALSE)))
##    user  system elapsed 
## 193.585  78.733 123.975

Check correlations between \(W\), \(\gamma_{mu}\), \(\gamma_{pi}\), detection rate, and coverage.

\(W_1\) is correlated with \(\gamma_{\mu}\) but not \(\gamma_{\pi}\).
\(\gamma_{\mu}\) and \(\gamma_{\pi}\) are correlated.
\(\gamma_{\mu}\) is correlated with coverage.
\(\gamma_{\pi}\) is correlated with detection rate.

In what follow, we assume that \(W\) and \(\gamma_{\pi}\) are independent. We simulate \(W\), \(\gamma_{\pi}\), and \(\gamma_{\mu}\) separetely.

detection_rate <- colMeans(core>0)
coverage <- colSums(core)

par(mfrow = c(1,1))
plot(zinb@W, col = colMerged, pch = 19, xlab = 'W1', ylab = 'W2', main = 'W')

df <- data.frame(W1 = zinb@W[,1], W2 = zinb@W[,2],
                 detection_rate = detection_rate,
                 coverage = coverage)
pairs(df, pch=19, col=colMerged)

print(cor(df, method="spearman"))
##                         W1          W2 detection_rate   coverage
## W1              1.00000000 -0.01421971      0.7699318  0.6473462
## W2             -0.01421971  1.00000000     -0.1120463 -0.1601473
## detection_rate  0.76993179 -0.11204633      1.0000000  0.9336435
## coverage        0.64734621 -0.16014731      0.9336435  1.0000000
df <- data.frame(W1 = zinb@W[,1], W2 = zinb@W[,2],
                 gamma_mu = zinb@gamma_mu[1, ],
                 gamma_pi = zinb@gamma_pi[1, ])
pairs(df, pch=19, col=colMerged)

print(cor(df, method="spearman"))
##                   W1          W2    gamma_mu   gamma_pi
## W1        1.00000000 -0.01421971  0.46683363 -0.6656167
## W2       -0.01421971  1.00000000  0.01184591  0.1712210
## gamma_mu  0.46683363  0.01184591  1.00000000 -0.6436672
## gamma_pi -0.66561673  0.17122101 -0.64366715  1.0000000
df <- data.frame(W1 = zinb@W[,1], W2 = zinb@W[,2],
                 gamma_mu = zinb@gamma_mu[1, ],
                 gamma_pi = zinb@gamma_pi[1, ],
                 detection_rate = detection_rate,
                 coverage = coverage)
pairs(df, pch=19, col=colMerged)

print(cor(df, method="spearman"))
##                         W1          W2    gamma_mu   gamma_pi
## W1              1.00000000 -0.01421971  0.46683363 -0.6656167
## W2             -0.01421971  1.00000000  0.01184591  0.1712210
## gamma_mu        0.46683363  0.01184591  1.00000000 -0.6436672
## gamma_pi       -0.66561673  0.17122101 -0.64366715  1.0000000
## detection_rate  0.76993179 -0.11204633  0.81177969 -0.9248605
## coverage        0.64734621 -0.16014731  0.92242738 -0.8189961
##                detection_rate   coverage
## W1                  0.7699318  0.6473462
## W2                 -0.1120463 -0.1601473
## gamma_mu            0.8117797  0.9224274
## gamma_pi           -0.9248605 -0.8189961
## detection_rate      1.0000000  0.9336435
## coverage            0.9336435  1.0000000

Additionally, let’s check \(\alpha\) and \(\beta\).

df <- data.frame(alpha_mu_1 = zinb@alpha_mu[1, ],
                 alpha_mu_2 = zinb@alpha_mu[2, ],
                 alpha_pi_1 = zinb@alpha_pi[1, ],
                 alpha_pi_2 = zinb@alpha_pi[2, ])
pairs(df, pch=19, main = 'alpha')

print(cor(df, method="spearman"))
##             alpha_mu_1  alpha_mu_2 alpha_pi_1 alpha_pi_2
## alpha_mu_1  1.00000000  0.01469818 -0.4629298 -0.1623635
## alpha_mu_2  0.01469818  1.00000000 -0.2132317 -0.4163138
## alpha_pi_1 -0.46292977 -0.21323174  1.0000000  0.2318247
## alpha_pi_2 -0.16236350 -0.41631384  0.2318247  1.0000000
df <- data.frame(beta_mu = zinb@beta_mu[1, ],
                 beta_pi = zinb@beta_pi[1, ])
pairs(df, pch=19, main = 'Beta')

print(cor(df, method="spearman"))
##            beta_mu    beta_pi
## beta_mu  1.0000000 -0.3725573
## beta_pi -0.3725573  1.0000000

Pick number of cells to simulate. Here

ncells = 100
ncells
## [1] 100

W and gamma

We simulate \(W\), \(\gamma_{\mu}\), and \(\gamma_{\pi}\) from three multivariate gaussians.

df <- data.frame(W1 = zinb@W[,1], W2 = zinb@W[,2],
                 gamma_mu = zinb@gamma_mu[1, ],
                 gamma_pi = zinb@gamma_pi[1, ])
pairs(df, pch=19, col=colMerged)

print(cor(df, method="spearman"))
##                   W1          W2    gamma_mu   gamma_pi
## W1        1.00000000 -0.01421971  0.46683363 -0.6656167
## W2       -0.01421971  1.00000000  0.01184591  0.1712210
## gamma_mu  0.46683363  0.01184591  1.00000000 -0.6436672
## gamma_pi -0.66561673  0.17122101 -0.64366715  1.0000000
# mclust
mclust = Mclust(df, G = 3)
pairs(mclust$data, col = mclust$classification)

# multivar gaussian
clust = sample(mclust$classification, ncells)
sim = lapply(clust, function(i){
  mvrnorm(n = 1, mu = mclust$parameters$mean[, i], 
          Sigma = mclust$parameters$variance$sigma[,, i])
})
sim = do.call(rbind, sim)
bio = clust
pairs(sim, col = bio)

print(cor(sim, method="spearman"))
##                  W1          W2    gamma_mu   gamma_pi
## W1        1.0000000 -0.12055206  0.50000600 -0.6895890
## W2       -0.1205521  1.00000000 -0.04418842  0.2045245
## gamma_mu  0.5000060 -0.04418842  1.00000000 -0.5445305
## gamma_pi -0.6895890  0.20452445 -0.54453045  1.0000000
par(mfrow = c(1, 1))
sim_W = sim[, 1:2]
sim_gamma_mu = sim[, 3]
sim_gamma_pi = sim[, 4]

Simulate counts

Original gamma_pi

Let’s simulate counts from the simulated \(W\), \(\gamma_{\mu}\), and \(\gamma_{\pi}\). Zero fraction of the simulated counts is

mod = zinbModel(W=sim_W, gamma_mu = matrix(sim_gamma_mu, nrow = 1),
                gamma_pi = matrix(sim_gamma_pi, nrow = 1),
                alpha_mu=zinb@alpha_mu, alpha_pi=zinb@alpha_pi,
                beta_mu=zinb@beta_mu, beta_pi=zinb@beta_pi, zeta = zinb@zeta)
sim = zinbSim(mod, seed = 8746)
core = t(sim$counts)
dim(core)
## [1] 1000  100
sum(core == 0)/(nrow(core) * ncol(core))
## [1] 0.46756
fq <- betweenLaneNormalization(core, which="full")
pca <- prcomp(t(log1p(fq)))
plot(pca$x, col=clust, pch=19, main="PCA of log-counts")

Changing gamma_pi

We want three different zero fractions (25%, 50%, 75%).

gammaOffset = c('25' = -3.2, '50' = .3, '75' = 2.7)
gammaPi = sapply(gammaOffset, function(x){
  sim_gamma_pi + x
})
ggplot(melt(gammaPi), aes(x = factor(X2), y = value)) + theme_bw() +
  geom_boxplot() + xlab('zero fraction') +
  theme_bw() + ylab('gamma_pi')

models = lapply(1:3, function(i){
  zinbModel(W=sim_W, gamma_mu = matrix(sim_gamma_mu, nrow = 1),
            gamma_pi = matrix(gammaPi[,i], nrow = 1),
            alpha_mu=zinb@alpha_mu, alpha_pi=zinb@alpha_pi,
            beta_mu=zinb@beta_mu, beta_pi=zinb@beta_pi, zeta = zinb@zeta)
})
## check
for (i in 1:3){
  mm = models[[i]]
  cc = data.frame(mm@W[,1], mm@W[,2], mm@gamma_mu[1,], mm@gamma_pi[1,])
  print(cor(cc, method="spearman"))  
}
##                   mm.W...1.   mm.W...2. mm.gamma_mu.1... mm.gamma_pi.1...
## mm.W...1.         1.0000000 -0.12055206       0.50000600       -0.6895890
## mm.W...2.        -0.1205521  1.00000000      -0.04418842        0.2045245
## mm.gamma_mu.1...  0.5000060 -0.04418842       1.00000000       -0.5445305
## mm.gamma_pi.1... -0.6895890  0.20452445      -0.54453045        1.0000000
##                   mm.W...1.   mm.W...2. mm.gamma_mu.1... mm.gamma_pi.1...
## mm.W...1.         1.0000000 -0.12055206       0.50000600       -0.6895890
## mm.W...2.        -0.1205521  1.00000000      -0.04418842        0.2045245
## mm.gamma_mu.1...  0.5000060 -0.04418842       1.00000000       -0.5445305
## mm.gamma_pi.1... -0.6895890  0.20452445      -0.54453045        1.0000000
##                   mm.W...1.   mm.W...2. mm.gamma_mu.1... mm.gamma_pi.1...
## mm.W...1.         1.0000000 -0.12055206       0.50000600       -0.6895890
## mm.W...2.        -0.1205521  1.00000000      -0.04418842        0.2045245
## mm.gamma_mu.1...  0.5000060 -0.04418842       1.00000000       -0.5445305
## mm.gamma_pi.1... -0.6895890  0.20452445      -0.54453045        1.0000000
pal = colorRampPalette(c("black","black", "red","yellow"), space="rgb")
par(mfrow=c(1, 3))
for (k in 1:3){
  smoothScatter(colMeans(log1p(getMu(models[[k]]))), 
                colMeans(getPi(models[[k]])), nbin = 256,
                nrpoints = Inf, pch = "", cex = .7, xlim = c(0,8),
                xlab = "Average simulated log Mu", main = names(gammaOffset)[k],
                ylab = "Average simulated Pi",ylim = c(0,1),
                colramp = pal)
}

10 replicates

B = 10
bio = clust
sim = lapply(1:3, function(k){
  simModel = models[[k]]
  simData = lapply(seq_len(B), function(i){
    zinbSim(simModel, seed = i*k)
  })
  fileName = sprintf("./datasets/simZeisel%s.rda", names(gammaOffset[k]))
  save(bio, simModel, simData, file = fileName)  
  sapply(simData, function(x) x$zeroFraction)
})
zeroFrac = data.frame(do.call(cbind, sim), stringsAsFactors = F)
colnames(zeroFrac) = names(gammaOffset)
ggplot(melt(zeroFrac), aes(x = variable, y = value)) + xlab('') + theme_bw() +
  geom_boxplot() + xlab('zero fraction') +
  theme_bw() + ylab('simulated zero fraction')

Let’s look at one simulated dataset for each zero fraction.

par(mfrow=c(1, 3))
for (k in 1:3){
  load(sprintf("./datasets/simZeisel%s.rda", names(gammaOffset[k])))
  counts = simData[[1]]$counts
  zero_rate <- rowMeans(counts == 0)
  plot(rowMeans(getPi(models[[k]])), zero_rate, main = names(gammaOffset)[k],
     xlab="Average simulated Pi", ylab="Simulated Zero Rate",
     pch=19, col=bio, ylim = c(0, 1), xlim = c(0,1))
  abline(a=0,b=1,col='gray')
}

par(mfrow=c(1, 3))
for (k in 1:3){
  load(sprintf("./datasets/simZeisel%s.rda", names(gammaOffset[k])))
  counts = simData[[1]]$counts
  coverage <- rowSums(counts)
  plot(rowMeans(log1p(getMu(models[[k]]))), log1p(coverage), xlim = c(0, 3),
     xlab="Average simulated log Mu", ylab="log Coverage", pch=19,
     col=bio,ylim = c(3,10), main = names(gammaOffset)[k])
}